#!/usr/bin/python
# encoding=utf8

import urllib
import urllib2
from bs4 import BeautifulSoup
import sqlite3
from time import sleep
from bs4 import BeautifulSoup
import time
import datetime
from random import randint
import os
import csv
import requests
import re

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select

## START DATE ##
# start_date = datetime.datetime.strptime("01 01 1992", "%m %d %Y")
start_date = datetime.datetime.strptime("06 01 2016", "%d %m %Y")
## - ##
it_months = {
'gennaio':'01',
'febbraio':'02',
'marzo':'03',
'aprile':'04',
'maggio':'05',
'giugno':'06',
'luglio':'07',
'agosto':'08',
'settembre':'09',
'ottobre':'10',
'novembre':'11',
'dicembre':'12',
}

## FUN ##

## NOTE: All search are exact match

def createArticleTable(db, source, search_terms, table_name):

    conn = sqlite3.connect(db)
    tb_exists = "SELECT name FROM sqlite_master WHERE type='table' AND name = '%s'" % (table_name)
    
    if not conn.execute(tb_exists).fetchone():
        try:
            cursor = conn.cursor()
            sql1 = """CREATE TABLE %s (
            url CHAR PRIMARY KEY,
            date DATETIME,
            title1 CHAR,
            title2 CHAR,
            title3 CHAR,
            author CHAR,
            page_number CHAR,
            body TEXT,
            source CHAR DEFAULT "%s",
            search_terms CHAR DEFAULT "%s",
            timestamp DATETIME DEFAULT (CURRENT_TIMESTAMP)
            );""" % (table_name, source, ' '.join(search_terms))
            cursor.execute(sql1)

            conn.commit()

        except Exception,e:
            print str(e)

    return

def parseRepArchiveUrls (db, index, query_mode = 'phrase'):

    source = 'repubblica'

    table_name = getTableName(db, index, source)
    search_terms = getTerms(db, index)

    query_table_name = getTableName(db, index, source)
    table_name = query_table_name + '_articles'
    
    search_terms = getTerms(db, index)

    createArticleTable(db, source, search_terms, table_name)

    query_list = queryDbForDatesWithArticles(db, query_table_name)

    try: 

        base_url = "http://ricerca.repubblica.it/ricerca/repubblica-extra?query="

        for week in query_list:

            print("Searching '" + ' '.join(search_terms) + "' in " + source + " from " + week['from_date'])

            url = base_url + '+'.join(search_terms) + '&fromdate=' + week['from_date'] + '&todate=' + week['to_date'] + '&sortby=score&author=&mode=' + query_mode

            while True: 

                response = urllib2.urlopen(url)
                the_page = response.read()
                soup = BeautifulSoup(the_page)

                for article in soup.findAll("article"):
                    article_url = article.find("a", { "title" : "Leggi l'articolo"})['href']
                    print('Parsing ' + article_url + '...')
                    page_number = article.find("span", {"class" : "pages"}).text + " - " + article.find("span", {"class" : "section"}).text
                    page_number = page_number
                    parsed_dict = parseRepArticle(article_url, page_number)

                    if parsed_dict is None:
                        print("Coudn't parse article. Moving to next")
                        next
                    
                    enterArticle(db, table_name, parsed_dict)
                    print('Done...\n')

                    sleep(randint(20,30))

                sleep(randint(20,30))

                successiva = soup.find('a', text = 'Successiva')
                if successiva is not None:
                    url =  successiva['href']
                    print('Going to next page...\n')
                else:
                    break
                   

    except Exception, e:
            print str(e)
            return

    return
                


def parseCorArchiveUrls (db, index, query_mode = 'phrase'):

    if query_mode == 'any':
        query_mode = 'simpleany'

    if query_mode == 'all':
        query_mode = 'simpleall'

    source = 'corriere'

    query_table_name = getTableName(db, index, source)
    table_name = query_table_name + '_articles'
    
    search_terms = getTerms(db, index)

    createArticleTable(db, source, search_terms, table_name)

    query_list = queryDbForDatesWithArticles(db, query_table_name)

    try:

        url = 'http://sitesearch.corriere.it/archivioStoricoEngine.action'

        for week in query_list:

            print("Searching '" + ' '.join(search_terms) + "' in " + source + " from " + week['from_date'])

            dateSplitDict = {'fromDay' : week.get('from_date').split('-')[2],
                    'fromMonth' : week.get('from_date').split('-')[1],
                    'fromYear' : week.get('from_date').split('-')[0],
                    'toDay' : week.get('to_date').split('-')[2],
                    'toMonth' : week.get('to_date').split('-')[1],
                    'toYear' : week.get('to_date').split('-')[0]}

            values = {'queryString' : ' '.join(search_terms),
                      'fromDay' : dateSplitDict['fromDay'],
                      'fromMonth' : dateSplitDict['fromMonth'],
                      'fromYear' : dateSplitDict['fromYear'],
                      'toDay' : dateSplitDict['toDay'],
                      'toMonth' : dateSplitDict['toMonth'],
                      'toYear' : dateSplitDict['toYear'],
                      'sectionCorriere' : 'true',
                      'queryMode' : query_mode}
 
            data = urllib.urlencode(values)
            req = urllib2.Request(url, data)
            response = urllib2.urlopen(req)
            the_page = response.read()
            soup = BeautifulSoup(the_page)

            titoli_risultati = soup.find("div", { "id" : "titoli-risultati"})

            for a in titoli_risultati.findAll('a'):
                article_url = a['href']
                print('Parsing ' + article_url + '...')
                parsed_dict = parseCorArticle(article_url)
                enterArticle(db, table_name, parsed_dict)
                print('Done...\n')

                sleep(randint(20,30))

            sleep(randint(20,30))

    except Exception, e:
            print str(e)
            return

    return

def enterArticle (db, table_name, parsed_dict):

    try:
        conn = sqlite3.connect(db)
        cursor = conn.cursor()
        sql1 = '''INSERT OR IGNORE INTO %s (url, date, title1, title2, title3, author, page_number, body) VALUES ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")''' % (table_name, parsed_dict['url'], parsed_dict['date'], parsed_dict['title1'].replace('"',''), parsed_dict['title2'].replace('"',''), parsed_dict['title3'].replace('"',''), parsed_dict['author'], parsed_dict['page_number'], parsed_dict['body'].replace('"',''))
        cursor.execute(sql1)
        conn.commit()
    except Exception, e:
        print str(e)

    return
            

def queryDbForDatesWithArticles(db, table_name):

    try:
        conn = sqlite3.connect(db)
        cursor = conn.cursor()
        sql1 = "SELECT from_date, to_date FROM %s WHERE hits > 0 ORDER BY from_date ASC" % table_name
        cursor.execute(sql1)
    except Exception,e:
        print str(e)


    query_list = []
    
    for (from_date, to_date) in cursor.fetchall():
        query_list.append({'from_date':from_date.encode("utf-8"),
                           'to_date':to_date.encode("utf-8")})

    return(query_list)

    
def parseRepArticle (url, page_number):

    attempts = 0
    
    while attempts < 3:
        html_doc = requests.get(url)
        soup = BeautifulSoup(html_doc.text)
        article = soup.find('article')

        if article is not None:
            break
        else:
            attempts += 1
            sleep(randint(10,20))

    if article is None:
        return

    # Date
    year = url.split('/')[6]
    month = url.split('/')[7]
    day = url.split('/')[8]
    date = year + '-' + month + '-' + day

    title1 = article.h1
    if title1 is not None:
        title1 = title1.text.strip()

    title2 = ''
    title3 = ''
    author = ''

    # Parse article
    body = ''
    for p in article.findAll('p'):
        body = body + p.text

    return({'url' : url, 'date' : date,
            'title1' : title1, 'title2' : title2, 'title3' : title3,
            'page_number' : page_number, 'author' : author, 'body' : body})

    

def parseCorArticle (url):

    html_doc = requests.get(url)
    soup = BeautifulSoup(html_doc.text)

    # Date
    year = url.split('/')[3]
    month = url.split('/')[4]
    # Convert into number
    pattern = re.compile(r'\b(' + '|'.join(it_months.keys()) + r')\b')
    month = pattern.sub(lambda x: it_months[x.group()], month)
    day = url.split('/')[5]
    date = year + '-' + month + '-' + day
    
    title1 = soup.h1
    if title1 is not None:
        title1 = title1.text

    title2 = soup.findAll('h2')
    if title2 is not None:
        if len(title2) > 1:
            title2 = title2[1].text
        else:
            title2 = title2[0].text

    title3 = soup.h3
    if title3 is not None:
        title3 = title3.text

    page_number = soup.findAll("p", { "class" : "footnotes-archivio" })
    if page_number is not None:
        page_number = page_number[0].text.split('(')[0]

    author = soup.findAll("p", { "class" : "footnotes" })
    if author is not None:
        author = author[0].text

    # Parse article
    body = ''
    div_article = soup.find("div", { "class" : "article" })
    if div_article is not None:
        ps = div_article.findAll("p")
        if len(ps) > 1:
            body = ps[1].text

    
    return({'url' : url, 'date' : date,
            'title1' : title1, 'title2' : title2, 'title3' : title3,
            'page_number' : page_number, 'author' : author, 'body' : body})


def getTerms(db, index):

    try:
        conn = sqlite3.connect(db)
        cursor = conn.cursor()
        sql1 = "SELECT terms FROM search_term WHERE term_id = '%s'" % index
        cursor.execute(sql1)

        result = cursor.fetchone()
        return result[0].split()

    except Exception,e:
        print str(e)

def getTableName(db, index, source):

    try:
        conn = sqlite3.connect(db)
        cursor = conn.cursor()
        sql1 = "SELECT table_name FROM search_term WHERE term_id = '%s'" % index
        cursor.execute(sql1)

        result = cursor.fetchone()
        return source + '_' + str(result[0])

    except Exception,e:
        print str(e)

def termsIsPresent(db, from_date, table_name):

     try:
        conn = sqlite3.connect(db)
        cursor = conn.cursor()
        sql1 = "SELECT search_terms FROM %s WHERE from_date = '%s'" % (table_name, from_date)
        cursor.execute(sql1)

        result = cursor.fetchone()
        return result[0].split()

     except Exception,e:
        print str(e)
        

            
def searchRepubblica(db, driver, index, query_mode = 'phrase'):

    source = 'repubblica'

    table_name = getTableName(db, index, source)
    search_terms = getTerms(db, index)

    createTable(db, start_date, source, search_terms, table_name)

    query_list = queryDbForMissingDates(db, table_name)

    try: 

        base_url = "http://ricerca.repubblica.it/ricerca/repubblica-extra?query="

        for week in query_list:

            result = termsIsPresent(db, week['from_date'], table_name)

            if len(result)>0:
                search_terms = result

            print("Searching '" + ' '.join(search_terms) + "' in " + source + " from " + week['from_date'])

            url = base_url + '+'.join(search_terms) + '&fromdate=' + week['from_date'] + '&todate=' + week['to_date'] + '&sortby=score&author=&mode=' + query_mode

            driver.get(url)

            hits = driver.find_element_by_xpath("//*[@id = 'n-risultati']/strong").text.encode('utf-8')

            print('Hits: ' + hits)

            enterResult(db, hits, table_name, week['from_date'])

            sleep(randint(10,20))

    except Exception, e:
        print str(e)
        return

          

def searchCorriere(db, index, query_mode = 'phrase'):

    if query_mode == 'any':
        query_mode = 'simpleany'

    if query_mode == 'all':
        query_mode = 'simpleall'

    source = 'corriere'

    table_name = getTableName(db, index, source)
    search_terms = getTerms(db, index)

    createTable(db, start_date, source, search_terms, table_name)

    query_list = queryDbForMissingDates(db, table_name)

    try:

        url = 'http://sitesearch.corriere.it/archivioStoricoEngine.action'

        for week in query_list:

            result = termsIsPresent(db, week['from_date'], table_name)

            if len(result)>0:
                search_terms = result

            print("Searching '" + ' '.join(search_terms) + "' in " + source + " from " + week['from_date'])

            dateSplitDict = {'fromDay' : week.get('from_date').split('-')[2],
                    'fromMonth' : week.get('from_date').split('-')[1],
                    'fromYear' : week.get('from_date').split('-')[0],
                    'toDay' : week.get('to_date').split('-')[2],
                    'toMonth' : week.get('to_date').split('-')[1],
                    'toYear' : week.get('to_date').split('-')[0]}

            values = {'queryString' : ' '.join(search_terms),
                      'fromDay' : dateSplitDict['fromDay'],
                      'fromMonth' : dateSplitDict['fromMonth'],
                      'fromYear' : dateSplitDict['fromYear'],
                      'toDay' : dateSplitDict['toDay'],
                      'toMonth' : dateSplitDict['toMonth'],
                      'toYear' : dateSplitDict['toYear'],
                      'sectionCorriere' : 'true',
                      'queryMode' : query_mode}
 
            data = urllib.urlencode(values)
            req = urllib2.Request(url, data)
            response = urllib2.urlopen(req)
            the_page = response.read()
            soup = BeautifulSoup(the_page)
            hits = soup.find(class_="count-search-results").find("strong").get_text(strip=True).encode('utf-8')

            print('Hits: ' + hits)

            enterResult(db, hits, table_name, week['from_date'])

            sleep(randint(50,70))

    except Exception, e:
            print str(e)
            return

    return


def queryDbForMissingDates(db, table_name):

    try:
        conn = sqlite3.connect(db)
        cursor = conn.cursor()
        sql1 = "SELECT from_date, to_date FROM %s WHERE status IS NOT 'OK'" % table_name
        cursor.execute(sql1)
    except Exception,e:
        print str(e)


    query_list = []
    
    for (from_date, to_date) in cursor.fetchall():
        query_list.append({'from_date':from_date.encode("utf-8"),
                           'to_date':to_date.encode("utf-8")})

    return(query_list)


def enterResult (db, hits, table_name, from_date):

    hits = hits.replace(',', '')

    if RepresentsInt(hits):
        try:
            conn = sqlite3.connect(db)
            cursor = conn.cursor()
            sql1 = "UPDATE %s SET hits = %s, status = 'OK' WHERE from_date = '%s'" % (table_name, hits, from_date)
            cursor.execute(sql1)
            conn.commit()
        except Exception, e:
            print str(e)

    else:
        pass

    return
    

def RepresentsInt(s):
    try: 
        int(s)
        return True
    except ValueError:
        return False
         

def createTable(db, start_date, source, search_terms, table_name):

    conn = sqlite3.connect(db)
    tb_exists = "SELECT name FROM sqlite_master WHERE type='table' AND name = '%s'" % (table_name)
    
    if not conn.execute(tb_exists).fetchone():
        try:
            cursor = conn.cursor()
            sql1 = """CREATE TABLE %s (
            from_date DATETIME PRIMARY KEY,
            to_date DATETIME,
            status INTEGER,
            source CHAR DEFAULT "%s",
            search_terms CHAR DEFAULT "%s",
            hits INTEGER,
            timestamp DATETIME DEFAULT (CURRENT_TIMESTAMP)
            );""" % (table_name, source, ' '.join(search_terms))
            cursor.execute(sql1)

            from_date = start_date
            # for n in range(1204):
            for n in range(25):
                to_date = from_date + datetime.timedelta(days=6)
                sql2 = "INSERT INTO %s (from_date, to_date) VALUES ('%s', '%s')" % (table_name, from_date.strftime('%Y-%m-%d'), to_date.strftime('%Y-%m-%d'))
                cursor.execute(sql2)
                from_date = from_date + datetime.timedelta(days=7)

            conn.commit()

        except Exception,e:
            print str(e)

    return

def returnClosestPriorDate (dict, dateEventX):
    
    list = []

    for date in dict.keys():
        list.append(datetime.datetime.strptime(date, "%Y-%m-%d"))

    list = sorted(list)

    for index in range(len(list)-1):
        if  (dateEventX >= list[index] and dateEventX < list[index+1]):
            for value, key in dict.iteritems():
                if datetime.datetime.strptime(value, "%Y-%m-%d") == list[index]:
                    return key

                
def fillTableWithPolFigure (db, source, who, table_name):

    conn = sqlite3.connect(db)
    tb_exists = "SELECT name FROM sqlite_master WHERE type='table' AND name = '%s'" % (table_name)

    if not conn.execute(tb_exists).fetchone():
        try:
            cursor = conn.cursor()
            sql1 = """CREATE TABLE %s (
            from_date DATETIME PRIMARY KEY,
            to_date DATETIME,
            status INTEGER,
            source CHAR DEFAULT "%s",
            search_terms CHAR,
            hits INTEGER,
            timestamp DATETIME DEFAULT (CURRENT_TIMESTAMP)
            );""" % (table_name, source)
            cursor.execute(sql1)

            # PresCon dictionary
            with open('prescon_name_date.csv', mode='r') as infile:
                    reader = csv.reader(infile)
                    presCon = {rows[1]:rows[0] for rows in reader}

                # SegOpp dictionary
            with open('segopp_name_date.csv', mode='r') as infile:
                    reader = csv.reader(infile)
                    segOpp = {rows[1]:rows[0] for rows in reader}

            from_date = start_date
            
            for n in range(1204):
                to_date = from_date + datetime.timedelta(days=6)

                if who == ["PresCon"]:
                    who_name = returnClosestPriorDate (presCon, from_date)
                if who == ["SegOpp"]:
                    who_name = returnClosestPriorDate (segOpp, from_date)
                    
                sql2 = "INSERT INTO %s (from_date, to_date, search_terms) VALUES ('%s', '%s', '%s')" % (table_name, from_date.strftime('%Y-%m-%d'), to_date.strftime('%Y-%m-%d'), who_name)
                cursor.execute(sql2)
                from_date = from_date + datetime.timedelta(days=7)

            conn.commit()


        except Exception,e:
            print str(e)

    

    
